# coding=utf-8
from bs4 import BeautifulSoup
import re
import urlparse
import html_downloader


class HtmlParser(object):
    """Parse NVD (https://nvd.nist.gov/) pages.

    The class collects links from index/listing pages and scrapes the
    vulnerability fields from detail pages.  All ``soup`` arguments are
    BeautifulSoup documents (or sub-trees) built with ``html.parser``.
    """

    # Base against which relative hrefs found on crawled pages are resolved.
    _BASE_URL = "https://nvd.nist.gov/"

    # (record key, tag name, value of the ``data-testid`` attribute) for each
    # field scraped from a vulnerability detail page.
    _DETAIL_FIELDS = (
        ('release_time', 'dd', 'vuln-published-on'),
        ('modified_time', 'dd', 'vuln-last-modified-on'),
        ('source', 'dd', 'vuln-source'),
        ('description', 'p', 'vuln-description'),
    )

    def __init__(self):
        self.downloader = html_downloader.HtmlDownloader()

    @staticmethod
    def _make_soup(html_cont):
        """Build a BeautifulSoup tree from raw page content."""
        return BeautifulSoup(html_cont, 'html.parser', from_encoding='utf-8')

    @staticmethod
    def _text_of(node):
        """Return ``node.text``, or None when the element was not found."""
        return node.text if node is not None else None

    def _collect_links(self, soup, tag_name, css_class):
        """Return absolute URLs of the first ``<a>`` in each matching element.

        Elements are matched with ``soup.find_all(tag_name, class_=css_class)``.
        Elements without an anchor, or whose anchor has no ``href``, are
        skipped instead of raising.  The result keeps document order and
        contains each URL only once.
        """
        ordered = []
        seen = set()
        for container in soup.find_all(tag_name, class_=css_class):
            anchor = container.find('a')
            if anchor is None:
                continue
            href = anchor.get('href')
            if href is None:
                continue
            full_url = urlparse.urljoin(self._BASE_URL, href)
            if full_url not in seen:
                seen.add(full_url)
                ordered.append(full_url)
        return ordered

    def _extract_record(self, soup):
        """Scrape one vulnerability record from a detail-page soup.

        Returns a dict with the keys ``id``, ``release_time``,
        ``modified_time``, ``source`` and ``description``.  ``id`` is the
        page title with tabs, newlines and spaces removed.  A field whose
        element is missing from the page is stored as None.
        """
        title = self._text_of(soup.find('title'))
        if title is not None:
            title = title.replace('\t', '').replace('\n', '').replace(' ', '')
        record = {'id': title}
        for key, tag_name, test_id in self._DETAIL_FIELDS:
            node = soup.find(tag_name, attrs={'data-testid': test_id})
            record[key] = self._text_of(node)
        return record

    def _get_new_urls(self, soup):
        """Crawl two levels down from ``soup`` and print every record found.

        For each link inside a ``ul.list-inline`` element of ``soup`` the
        linked page is downloaded; for each link inside a ``span.col-md-2``
        element of that page the detail page is downloaded, scraped and
        printed.

        Returns the set of absolute URLs found in ``soup`` itself (the
        first-level links), mirroring what ``_get_new_urlss`` returns for
        its own soup.  Previously the set was created but never filled or
        returned, so ``parse`` always yielded None.
        """
        listing_urls = self._collect_links(soup, 'ul', 'list-inline')
        for listing_url in listing_urls:
            print('craw : %s' % (listing_url))
            listing_html = self.downloader.download(listing_url)
            # NOTE(review): assumes download() yields a falsy value when the
            # fetch fails -- confirm against html_downloader.
            if not listing_html:
                continue
            listing_soup = self._make_soup(listing_html)
            for detail_url in self._collect_links(listing_soup, 'span', 'col-md-2'):
                detail_html = self.downloader.download(detail_url)
                if not detail_html:
                    continue
                data = self._extract_record(self._make_soup(detail_html))
                print('id1:%s,release_time :%s,modified_time:%s,source:%s,description:%s' % (
                    data['id'], data['release_time'], data['modified_time'],
                    data['source'], data['description']))
        return set(listing_urls)

    def _get_new_urlss(self, soup):
        """Return the set of absolute URLs linked from ``span.col-md-2`` elements."""
        return set(self._collect_links(soup, 'span', 'col-md-2'))

    def _get_data(self, soup):
        """Print the page's ``<title>`` element and return the scraped record.

        The returned dict has the shape described in ``_extract_record``.
        Previously nothing was returned, so ``parse2`` always yielded None.
        """
        print(soup.find('title'))
        return self._extract_record(soup)

    def parse(self, html_cont):
        """Parse an index page; see ``_get_new_urls``.  None content gives an empty set."""
        if html_cont is None:
            return set()
        return self._get_new_urls(self._make_soup(html_cont))

    def parse1(self, html_cont):
        """Parse a listing page; see ``_get_new_urlss``.  None content gives an empty set."""
        if html_cont is None:
            return set()
        return self._get_new_urlss(self._make_soup(html_cont))

    def parse2(self, html_cont):
        """Parse a detail page; see ``_get_data``.  None content gives None."""
        if html_cont is None:
            return None
        return self._get_data(self._make_soup(html_cont))
